Use the Heart Attack (CaseStudy12_ AdultsHeartAttack) data to:
Identify and impute any missing values
# Load the heart-attack case-study data; the raw CSV codes missing values as ".".
ha<- read.csv("D:/UBD PB/ZH5102 ML/ass2/CaseStudy12_ AdultsHeartAttack_Data.csv",header = T)
# Recode the "." placeholder to NA so R recognizes the missing entries.
ha[ha=="."] <- NA
str(ha)
## 'data.frame': 150 obs. of 8 variables:
## $ Patient : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DIAGNOSIS: int 41041 41041 41091 41081 41091 41091 41091 41091 41041 41041 ...
## $ SEX : chr "F" "F" "F" "F" ...
## $ DRG : int 122 122 122 122 122 121 121 121 121 123 ...
## $ DIED : int 0 0 0 0 0 0 0 0 0 1 ...
## $ CHARGES : chr "4752" "3941" "3657" "1481" ...
## $ LOS : int 10 6 5 2 1 9 15 15 2 1 ...
## $ AGE : int 79 34 76 80 55 84 84 70 76 65 ...
# DIAGNOSIS and SEX are categorical; CHARGES was read as character because of
# the "." placeholders, so coerce it back to numeric.
ha$DIAGNOSIS<-as.factor(ha$DIAGNOSIS)
ha$SEX<-as.factor(ha$SEX)
ha$CHARGES<-as.numeric(ha$CHARGES)
# Tabulate complete vs. incomplete rows: 2 of 150 rows (~1.3%) contain NAs.
hac<-complete.cases(ha); table(hac); prop.table(table(hac))
## hac
## FALSE TRUE
## 2 148
## hac
## FALSE TRUE
## 0.01333333 0.98666667
library(mi)
## Loading required package: Matrix
## Loading required package: stats4
## mi (Version 1.1, packaged: 2022-06-05 05:31:15 UTC; ben)
## mi Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
## This program comes with ABSOLUTELY NO WARRANTY.
## This is free software, and you are welcome to redistribute it
## under the General Public License version 2 or later.
## Execute RShowDoc('COPYING') for details.
# Wrap the data as a missing_data.frame: mi infers a variable type and an
# imputation model per column (only CHARGES has missing entries, hence the
# auto-generated missing_CHARGES indicator).
mdf<-missing_data.frame(ha)
head(mdf)
## Patient DIAGNOSIS SEX DRG DIED CHARGES LOS AGE missing_CHARGES
## 1 1 41041 F 122 0 4752 10 79 FALSE
## 2 2 41041 F 122 0 3941 6 34 FALSE
## 3 3 41091 F 122 0 3657 5 76 FALSE
## 4 4 41081 F 122 0 1481 2 80 FALSE
## 5 5 41091 M 122 0 1681 1 55 FALSE
## 6 6 41091 M 121 0 6379 9 84 FALSE
# Display the inferred types, imputation methods and models per variable.
show(mdf)
## Object of class missing_data.frame with 150 observations on 8 variables
##
## There are 2 missing data patterns
##
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
##
## type missing method model
## Patient continuous 0 <NA> <NA>
## DIAGNOSIS unordered-categorical 0 <NA> <NA>
## SEX binary 0 <NA> <NA>
## DRG ordered-categorical 0 <NA> <NA>
## DIED binary 0 <NA> <NA>
## CHARGES continuous 2 ppd linear
## LOS continuous 0 <NA> <NA>
## AGE continuous 0 <NA> <NA>
##
## family link transformation
## Patient <NA> <NA> standardize
## DIAGNOSIS <NA> <NA> <NA>
## SEX <NA> <NA> <NA>
## DRG <NA> <NA> <NA>
## DIED <NA> <NA> <NA>
## CHARGES gaussian identity standardize
## LOS <NA> <NA> standardize
## AGE <NA> <NA> standardize
# Impute CHARGES by predictive mean matching instead of the default ppd/linear.
mdf<-change(mdf, y=c("CHARGES") , what = "imputation_method", to="pmm")
# Run 3 chains of 10 iterations each for multiple imputation.
imputations<-mi(mdf, n.iter=10, n.chains=3, verbose=T)
# Keep the completed data set from the 3rd chain.
haim <- complete(imputations, 3)
haim<-haim$`chain:3`
# Standardize CHARGES (z-scores) after imputation.
haim$CHARGES <-scale(haim$CHARGES)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Encode the factor columns (DIAGNOSIS, SEX, DRG, DIED) and the logical
# missing_CHARGES indicator (columns 2:5 and 9) as integer codes.
# mutate_at() is superseded in dplyr; across() is the current idiom and
# produces the identical result.
haim<-haim%>%mutate(across(c(2,3,4,5,9),as.integer))
str(haim)
## 'data.frame': 150 obs. of 9 variables:
## $ Patient : num 1 2 3 4 5 6 7 8 9 10 ...
## $ DIAGNOSIS : int 4 4 7 6 7 7 7 7 4 4 ...
## $ SEX : int 1 1 1 1 2 2 1 1 2 1 ...
## $ DRG : int 2 2 2 2 2 1 1 1 1 3 ...
## $ DIED : int 1 1 1 1 1 1 1 1 1 2 ...
## $ CHARGES : num [1:150, 1] -0.207 -0.443 -0.526 -1.158 -1.1 ...
## ..- attr(*, "scaled:center")= num 5465
## ..- attr(*, "scaled:scale")= num 3440
## $ LOS : num 10 6 5 2 1 9 15 15 2 1 ...
## $ AGE : num 79 34 76 80 55 84 84 70 76 65 ...
## $ missing_CHARGES: int 0 0 0 0 0 0 0 0 0 0 ...

# Use the DIAGNOSIS as a clinically relevant outcome variable
# Outcome: DIAGNOSIS, now an integer code with 7 levels after the factor ->
# integer recoding above.
Y <- haim$DIAGNOSIS
table(Y)
## Y
## 1 2 3 4 5 6 7
## 2 32 1 55 16 1 43
str(Y)
## int [1:150] 4 4 7 6 7 7 7 7 4 4 ...
summary(Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 4.000 4.507 7.000 7.000
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Frequency histogram of Y using Freedman-Diaconis breaks (computed, not drawn),
# rendered with plotly.
h <- hist(Y, breaks='FD', plot = F)
plot_ly(x = h$mids, y = h$density, type = "bar") %>%
layout(bargap=0.1, title="Histogram of Computed Variable Y = DIAGNOSIS")
# Same on the log scale, which is the transformation used for modeling below.
h <- hist(log(Y), breaks='FD', plot = F)
plot_ly(x = h$mids, y = h$density, type = "bar") %>%
layout(bargap=0.1, title="Histogram of log(Y)")
# Model target: log-transformed DIAGNOSIS code.
Y <- log(haim$DIAGNOSIS)
# Design matrix: drop Patient (id), DIAGNOSIS (outcome), DIED, and the
# missing_CHARGES indicator, keeping SEX, DRG, CHARGES, LOS, AGE.
X <- as.matrix(haim[ , -c(1:2,5,9)])
str(X)
## num [1:150, 1:5] 1 1 1 1 2 2 1 1 2 1 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:5] "SEX" "DRG" "CHARGES" "LOS" ...
summary(X)
## SEX DRG CHARGES LOS
## Min. :1.0 Min. :1.000 Min. :-1.3849 Min. : 1.000
## 1st Qu.:1.0 1st Qu.:1.000 1st Qu.:-0.8177 1st Qu.: 1.000
## Median :2.0 Median :2.000 Median :-0.2204 Median : 3.000
## Mean :1.6 Mean :1.753 Mean : 0.0000 Mean : 4.167
## 3rd Qu.:2.0 3rd Qu.:2.000 3rd Qu.: 0.6819 3rd Qu.: 6.000
## Max. :2.0 Max. :3.000 Max. : 3.3928 Max. :28.000
## AGE
## Min. :34.00
## 1st Qu.:56.25
## Median :68.00
## Mean :66.33
## 3rd Qu.:76.00
## Max. :98.00Randomly split the data into training (70%) and testing (30%) sets
# Fix the RNG so the split is reproducible.
set.seed(2023)
# 3.5/5 = 0.7, i.e. a 70% training / 30% testing split.
train = sample(1 : nrow(X), round((3.5/5)* nrow(X)))
test = -train
# subset training data
YTrain = Y[train]
XTrain = X[train, ]
# training design matrix with an explicit intercept column for manual OLS use
XTrainOLS = cbind(rep(1, nrow(XTrain)), XTrain)
# subset test data
YTest = Y[test]
XTest = X[test, ]

# Use the LASSO model to standardize the predictors and report the model results
library(glmnet)
## Loaded glmnet 4.1-7
# glmnet automatically standardizes the predictors
# Baseline OLS on the training data for later comparison.
fitOLS = lm(YTrain ~ XTrain)
# alpha = 0 gives the ridge penalty; alpha = 1 gives the LASSO penalty.
fitRidge = glmnet(XTrain, YTrain, alpha = 0)
fitLASSO = glmnet(XTrain, YTrain, alpha = 1) # The LASSO
fitLASSO
##
## Call: glmnet(x = XTrain, y = YTrain, alpha = 1)
##
## Df %Dev Lambda
## 1 0 0.00 0.156100
## 2 1 2.02 0.142200
## 3 1 3.69 0.129600
## 4 1 5.08 0.118100
## 5 1 6.24 0.107600
## 6 1 7.20 0.098020
## 7 1 7.99 0.089310
## 8 2 8.73 0.081380
## 9 2 9.79 0.074150
## 10 2 10.67 0.067560
## 11 2 11.40 0.061560
## 12 2 12.01 0.056090
## 13 2 12.51 0.051110
## 14 2 12.93 0.046570
## 15 2 13.28 0.042430
## 16 2 13.56 0.038660
## 17 2 13.80 0.035230
## 18 3 14.07 0.032100
## 19 3 14.34 0.029250
## 20 4 14.67 0.026650
## 21 4 14.98 0.024280
## 22 4 15.24 0.022120
## 23 4 15.45 0.020160
## 24 4 15.63 0.018370
## 25 4 15.78 0.016740
## 26 5 15.92 0.015250
## 27 5 16.06 0.013890
## 28 5 16.17 0.012660
## 29 5 16.27 0.011540
## 30 5 16.35 0.010510
## 31 5 16.42 0.009577
## 32 5 16.48 0.008726
## 33 5 16.52 0.007951
## 34 5 16.56 0.007244
## 35 5 16.59 0.006601
## 36 5 16.62 0.006014
## 37 5 16.64 0.005480
## 38 5 16.66 0.004993
## 39 5 16.67 0.004550
## 40 5 16.69 0.004146
## 41 5 16.70 0.003777
## 42 5 16.71 0.003442
## 43 5 16.71 0.003136
## 44 5 16.72 0.002857
## 45 5 16.72 0.002603
## 46 5 16.73 0.002372
## 47 5 16.73 0.002161
## 48 5 16.74 0.001969
## 49 5 16.74 0.001794
## 50 5 16.74 0.001635
## 51 5 16.74 0.001490
## 52 5 16.74 0.001357
## 53 5 16.74 0.001237
## 54 5 16.74 0.001127
## 55 5 16.75 0.001027
## 56 5 16.75 0.000936
## 57 5 16.75 0.000852
## 58 5 16.75 0.000777
## 59 5 16.75 0.000708
## 60 5 16.75 0.000645
## 61 5 16.75 0.000588
## 62 5 16.75 0.000535
## 63 5 16.75 0.000488
## 64 5 16.75 0.000445
library(RColorBrewer)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:mi':
##
## complete
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(plotly)
### Plot Solution Path ###
# Plot the coefficient solution path of a glmnet fit: one line per predictor,
# coefficient value against log(lambda).
# NOTE(review): the name "plot.glmnet" looks like an S3 method and will mask
# glmnet's own plot() method for "glmnet" objects; it is kept unchanged here
# because it is called by name below.
plot.glmnet <- function(glmnet.object, name="") {
  # beta is a (predictors x lambda) sparse matrix; transpose so each row is a
  # lambda step and each column a predictor.
  df <- as.data.frame(t(as.matrix(glmnet.object$beta)))
  df$loglambda <- log(glmnet.object$lambda)
  # (removed a redundant second as.data.frame() call: df is already a data.frame)
  # Long format: one row per (lambda step, Variable) pair; the last column
  # (loglambda) is excluded from the gather.
  data_long <- gather(df, Variable, coefficient, 1:(dim(df)[2]-1), factor_key=TRUE)
  plot_ly(data = data_long) %>%
    add_trace(x = ~loglambda, y = ~coefficient, color=~Variable,
              colors=colorRampPalette(brewer.pal(10,"Spectral"))(dim(df)[2]), # "Dark2",
              type = 'scatter', mode = 'lines',
              name = ~Variable) %>%
    layout(title = paste0(name, " Model Coefficient Values"),
           xaxis = list(title = paste0("log(",TeX("\\lambda"),")"), side="bottom", showgrid = TRUE),
           hovermode = "x unified", legend = list(orientation='v'),
           yaxis = list(title = ' Model Coefficient Values', side="left", showgrid = TRUE))
}
plot.glmnet(fitLASSO, name="LASSO")
### DRG & CHARGES have high coefficient values and collapse to zero late in the
### path; this indicates DRG & CHARGES are the two predictor variables to be selected.

# Optimize the choice of the regularization parameter
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(glmnet)
# Spin up a 5-worker PSOCK cluster so cv.glmnet can run its folds in parallel.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)
# Ridge (alpha = 0): choose lambda by cross-validation on the training set.
cvridge <- cv.glmnet(XTrain, YTrain, alpha = 0, thresh = 1e-12, parallel = T)
lambda.bestr <- cvridge$lambda.min
lambda.bestr
## [1] 0.1597482
ridge.pred <- predict(cvridge, newx = XTest, s = lambda.bestr)
# Test-set mean squared error (labelled "RMS" throughout) for ridge.
ridge.RMS <- mean((YTest - ridge.pred)^2); ridge.RMS
## [1] 0.322462
ridge.test.r2 <- 1 - mean((YTest - ridge.pred)^2)/mean((YTest - mean(YTest))^2)
###
# LASSO (alpha = 1).
cvlasso <- cv.glmnet(XTrain, YTrain, alpha = 1, thresh = 1e-12, parallel = T)
lambda.bestl <- cvlasso$lambda.min
lambda.bestl
## [1] 0.04243053
lasso.pred <- predict(cvlasso, newx = XTest, s = lambda.bestl)
lasso.RMS <- mean((YTest - lasso.pred)^2); lasso.RMS
## [1] 0.3038537
lasso.test.r2 <- 1 - mean((YTest - lasso.pred)^2)/mean((YTest - mean(YTest))^2)
###
# Elastic net: alpha = 0.4 mixes the ridge and LASSO penalties.
cvel <- cv.glmnet(XTrain, YTrain, alpha = .4, thresh = 1e-12, parallel = T)
lambda.beste <- cvel$lambda.min
lambda.beste
## [1] 0.08806642
el.pred <- predict(cvel, newx = XTest, s = lambda.beste)
el.RMS <- mean((YTest - el.pred)^2); el.RMS
## [1] 0.3016407
el.test.r2 <- 1 - mean((YTest - el.pred)^2)/mean((YTest - mean(YTest))^2)
# barplot(c(lm.test.r2, lasso.test.r2, ridge.test.r2), col = "red", names.arg = c("OLS", "LASSO", "Ridge"), main = "Testing Data Derived R-squared")
# Compare test-set R^2 across the three penalized models.
plot_ly(x = c("EL", "LASSO", "Ridge"), y = c(el.test.r2, lasso.test.r2, ridge.test.r2),
name = paste0("Model ", TeX("R^2") ," Performance"), type = "bar") %>%
layout(title=paste0("Model ", TeX("R^2") ," Performance"))
library(knitr)
# Tabulate the test-set MSEs for the three penalized fits.
RMS_Table = data.frame(EL=el.RMS, LASSO=lasso.RMS, Ridge=ridge.RMS)
kable(RMS_Table, format="pandoc", caption="Test Dataset RSS Results", align=c("c", "c", "c"))
| EL | LASSO | Ridge |
|---|---|---|
| 0.3016407 | 0.3038537 | 0.322462 |
stopCluster(cl)

# Apply cross validation to report internal statistical validity of the model
# Plot the cross-validation MSE curve of a cv.glmnet fit: CV-mean points with
# +/- one-SD error bars, dashed vertical lines at lambda.min and lambda.1se,
# and the number of non-zero features annotated along the top.
plotCV.glmnet <- function(cv.glmnet.object, name="") {
  # NOTE(review): the original passed nzero= as an argument to as.data.frame(),
  # where it was silently ignored; it is now a proper column.
  df <- data.frame(x = log(cv.glmnet.object$lambda),
                   y = cv.glmnet.object$cvm,
                   errorBar = cv.glmnet.object$cvsd,
                   nzero = cv.glmnet.object$nzero)
  featureNum <- cv.glmnet.object$nzero
  xFeature <- log(cv.glmnet.object$lambda)
  # Place the feature-count labels just above the tallest error bar.
  yFeature <- max(cv.glmnet.object$cvm)+max(cv.glmnet.object$cvsd)
  dataFeature <- data.frame(featureNum, xFeature, yFeature)
  plot_ly(data = df) %>%
    # add error bars for each CV-mean at log(lambda)
    add_trace(x = ~x, y = ~y, type = 'scatter', mode = 'markers',
              name = 'CV MSE', error_y = ~list(array = errorBar)) %>%
    # add the lambda-min and lambda 1SD vertical dash lines
    add_lines(data=df, x=c(log(cv.glmnet.object$lambda.min), log(cv.glmnet.object$lambda.min)),
              y=c(min(cv.glmnet.object$cvm)-max(df$errorBar), max(cv.glmnet.object$cvm)+max(df$errorBar)),
              showlegend=F, line=list(dash="dash"), name="lambda.min", mode = 'lines+markers') %>%
    add_lines(data=df, x=c(log(cv.glmnet.object$lambda.1se), log(cv.glmnet.object$lambda.1se)),
              y=c(min(cv.glmnet.object$cvm)-max(df$errorBar), max(cv.glmnet.object$cvm)+max(df$errorBar)),
              showlegend=F, line=list(dash="dash"), name="lambda.1se") %>%
    # Add Number of Features Annotations on Top
    add_trace(dataFeature, x = ~xFeature, y = ~yFeature, type = 'scatter', name="Number of Features",
              mode = 'text', text = ~featureNum, textposition = 'middle right',
              textfont = list(color = '#000000', size = 9)) %>%
    layout(title = paste0("Cross-Validation MSE (", name, ")"),
           xaxis = list(title=paste0("log(",TeX("\\lambda"),")"), side="bottom", showgrid=TRUE), # type="log"
           hovermode = "x unified", legend = list(orientation='h'), # xaxis2 = ax,
           yaxis = list(title = cv.glmnet.object$name, side="left", showgrid = TRUE))
}
#plot(cv.glmmod)
# NOTE(review): "Parallel=T" (capital P) is not a cv.glmnet argument — the real
# one is lowercase "parallel" — so it is silently swallowed and this call runs
# sequentially; confirm intent (the cluster was already stopped above anyway).
cv.glmmodr <- cv.glmnet(as.matrix(XTrain), as.matrix(YTrain), alpha=0, Parallel=T)
plotCV.glmnet(cv.glmmodr, "Ridge")
# Ridge test-set MSE at the more conservative lambda.1se.
predRidge <- predict(cv.glmmodr, s = cv.glmmodr$lambda.1se, newx = as.matrix(XTest))
testMSE_Ridge <- mean((predRidge - YTest)^2); testMSE_Ridge
## [1] 0.2717447
# NOTE(review): this is a verbatim re-definition of plotCV.glmnet from above;
# it is kept (the knitted document defined it twice) with the same fix applied
# so the later definition does not reintroduce the ignored nzero= argument.
plotCV.glmnet <- function(cv.glmnet.object, name="") {
  # nzero is now a proper column rather than an ignored as.data.frame() arg.
  df <- data.frame(x = log(cv.glmnet.object$lambda),
                   y = cv.glmnet.object$cvm,
                   errorBar = cv.glmnet.object$cvsd,
                   nzero = cv.glmnet.object$nzero)
  featureNum <- cv.glmnet.object$nzero
  xFeature <- log(cv.glmnet.object$lambda)
  # Place the feature-count labels just above the tallest error bar.
  yFeature <- max(cv.glmnet.object$cvm)+max(cv.glmnet.object$cvsd)
  dataFeature <- data.frame(featureNum, xFeature, yFeature)
  plot_ly(data = df) %>%
    # add error bars for each CV-mean at log(lambda)
    add_trace(x = ~x, y = ~y, type = 'scatter', mode = 'markers',
              name = 'CV MSE', error_y = ~list(array = errorBar)) %>%
    # add the lambda-min and lambda 1SD vertical dash lines
    add_lines(data=df, x=c(log(cv.glmnet.object$lambda.min), log(cv.glmnet.object$lambda.min)),
              y=c(min(cv.glmnet.object$cvm)-max(df$errorBar), max(cv.glmnet.object$cvm)+max(df$errorBar)),
              showlegend=F, line=list(dash="dash"), name="lambda.min", mode = 'lines+markers') %>%
    add_lines(data=df, x=c(log(cv.glmnet.object$lambda.1se), log(cv.glmnet.object$lambda.1se)),
              y=c(min(cv.glmnet.object$cvm)-max(df$errorBar), max(cv.glmnet.object$cvm)+max(df$errorBar)),
              showlegend=F, line=list(dash="dash"), name="lambda.1se") %>%
    # Add Number of Features Annotations on Top
    add_trace(dataFeature, x = ~xFeature, y = ~yFeature, type = 'scatter', name="Number of Features",
              mode = 'text', text = ~featureNum, textposition = 'middle right',
              textfont = list(color = '#000000', size = 9)) %>%
    layout(title = paste0("Cross-Validation MSE (", name, ")"),
           xaxis = list(title=paste0("log(",TeX("\\lambda"),")"), side="bottom", showgrid=TRUE), # type="log"
           hovermode = "x unified", legend = list(orientation='h'), # xaxis2 = ax,
           yaxis = list(title = cv.glmnet.object$name, side="left", showgrid = TRUE))
}
# NOTE(review): "Parallel=T" is not cv.glmnet's argument (the real one is
# lowercase "parallel"), so this call runs sequentially — confirm intent.
cv.glmmodl <- cv.glmnet(as.matrix(XTrain), as.matrix(YTrain), alpha=1, Parallel=T)
plotCV.glmnet(cv.glmmodl, "LASSO")
# LASSO test-set MSE at the more conservative lambda.1se.
predLASSO <- predict(cv.glmmodl, s = cv.glmmodl$lambda.1se, newx = as.matrix(XTest))
testMSE_LASSO <- mean((predLASSO - YTest)^2); testMSE_LASSO
## [1] 0.2717447

# Report and compare the OLS, Stepwise OLS with AIC, Ridge and LASSO coefficient estimates
# Stepwise model selection (both directions) by AIC (k = 2), starting from the
# full OLS model on the training data.
dt = as.data.frame(cbind(YTrain,XTrain))
ols_step <- lm(YTrain ~., data = dt)
ols_step <- step(ols_step, direction = 'both', k=2, trace = F)
summary(ols_step)
##
## Call:
## lm(formula = YTrain ~ DRG + CHARGES, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.89481 -0.27436 0.03112 0.33422 0.80332
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.80584 0.12247 14.745 < 2e-16 ***
## DRG -0.24357 0.06575 -3.705 0.000344 ***
## CHARGES -0.08232 0.04275 -1.926 0.056938 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4236 on 102 degrees of freedom
## Multiple R-squared: 0.1497, Adjusted R-squared: 0.1331
## F-statistic: 8.981 on 2 and 102 DF, p-value: 0.0002555
# Manual test-set predictions from the selected model's coefficients
# (intercept column prepended to the retained predictors).
betaHatOLS_step = ols_step$coefficients
var_step <- colnames(ols_step$model)[-1]
XTestOLS_step = cbind(rep(1, nrow(XTest)), XTest[,var_step])
predOLS_step = as.matrix(XTestOLS_step)%*%as.matrix(betaHatOLS_step)
testMSEOLS_step = mean((predOLS_step - YTest)^2)
# Report MSE OLS Stepwise feature selection
testMSEOLS_step
## [1] 0.3219908
# Cross-check: predict() on the step model must reproduce the manual
# matrix-product predictions. Use all() — any() would pass even if only a
# single prediction matched. (In the knitted run every element matched.)
pred2 <- predict(ols_step,as.data.frame(XTest))
all(pred2 == predOLS_step)
## [1] TRUE
# OLS coefficient estimates
betaHatOLS = fitOLS$coefficients
# LASSO coefficient estimates at the conservative lambda.1se
betaHatLASSO = as.double(coef(fitLASSO, s = cvlasso$lambda.1se)) # s is lambda
# Ridge coefficient estimates
betaHatRidge = as.double(coef(fitRidge, s = cvridge$lambda.1se))
# Test Set MSE
# calculate predicted values
XTestOLS = cbind(rep(1, nrow(XTest)), XTest) # add intercept to test data
predOLS = as.matrix(XTestOLS)%*%betaHatOLS
predLASSO = predict(fitLASSO, s = cvlasso$lambda.1se, newx = XTest)
predRidge = predict(fitRidge, s = cvridge$lambda.1se, newx = XTest)
# calculate test set MSE
testMSEOLS = mean((predOLS - YTest)^2)
testMSELASSO = mean((predLASSO - YTest)^2)
testMSERidge = mean((predRidge - YTest)^2)
# Side-by-side coefficient table; merge(..., all=TRUE) fills variables the
# stepwise model dropped with NA.
df <- as.data.frame(cbind(Feature=attributes(betaHatOLS)$names, OLS=betaHatOLS,
LASSO=betaHatLASSO, Ridge=betaHatRidge))
df1<- as.data.frame(cbind(Feature=attributes(betaHatOLS)$names[c(1,3,4)], Stepwise=betaHatOLS_step[1:3]))
df2<-merge(df, df1,all=TRUE)
df2
## Feature OLS LASSO Ridge
## 1 (Intercept) 2.11494670818208 1.38197848487707 1.38197848487707
## 2 XTrainAGE -0.00440266703106079 0 -2.36455082515988e-39
## 3 XTrainCHARGES -0.0942239130440304 0 -8.88444765411039e-38
## 4 XTrainDRG -0.238310575218313 0 -2.50580178241008e-37
## 5 XTrainLOS 0.013029076493701 0 1.15164707612166e-38
## 6 XTrainSEX -0.0491741918441361 0 2.7830867750579e-38
## Stepwise
## 1 1.80584341116096
## 2 <NA>
## 3 -0.0823241309163229
## 4 -0.24357216655891
## 5 <NA>
## 6 <NA>
# Long format for plotting: one row per (Feature, Method) coefficient value.
data_long <- gather(df2, Method, value, OLS:Stepwise, factor_key=TRUE)
data_long$value <- as.numeric(data_long$value)
library(plotly)
# Dot plot of coefficient estimates by method; the NA Stepwise entries are
# dropped, hence the "Ignoring 3 observations" warning below.
plot_ly(data_long, x=~value, y=~Feature, type="scatter", mode="markers",
marker=list(size=20), color=~Method, symbol=~Method, symbols=c('circle-open','x-open','hexagon-open'))
## Warning: Ignoring 3 observations
# Calculate the predicted values for all 4 models and report the models' performance
# Summarize the test-set MSE for all four models in one table.
MSETable = data.frame(OLS=testMSEOLS, OLS_step=testMSEOLS_step, LASSO=testMSELASSO, Ridge=testMSERidge)
kable(MSETable, format="pandoc", caption="Test Set MSE", align=c("c", "c", "c", "c"))
| OLS | OLS_step | LASSO | Ridge |
|---|---|---|---|
| 0.352621 | 0.3219908 | 0.2717447 | 0.2717447 |
# Apply knockoff filtering to control the false variable selection rate
library(knockoff)
# Model-X knockoffs (second-order Gaussian construction) with target FDR 0.2.
result = knockoff.filter(X, Y, fdr=0.2, knockoffs= create.second_order)
print(result$selected)
## named integer(0)
knockoff_selected <- names(result$selected)
knockoff_selected
## character(0)
### No variable was selected using knockoff filtering.

# Compare the variables selected by Stepwise OLS, LASSO and knockoff
# Variables retained by stepwise AIC (drop the intercept).
var_step = names(ols_step$coefficients)[-1]
# Variables with non-zero LASSO coefficients at lambda.min; the "-1" shifts
# past the intercept row of the coefficient vector to index XTrain's columns.
var_lasso = colnames(XTrain)[which(coef(fitLASSO, s = cvlasso$lambda.min)!=0)-1]
var_step
## [1] "DRG" "CHARGES"
var_lasso
## [1] "DRG" "CHARGES"
knockoff_selected
## character(0)
list(Knockoff = knockoff_selected, Step = var_step, LASSO = var_lasso)
## $Knockoff
## character(0)
##
## $Step
## [1] "DRG" "CHARGES"
##
## $LASSO
## [1] "DRG" "CHARGES"
## Stepwise selects the same variables as LASSO (DRG and CHARGES), while the
## knockoff filter selects none; thus no feature is common to all three methods.

# Apply Bootstrap LASSO and knockoff, and compare the results.
# Switch foreach back to sequential execution and clear any stale backend state
# left behind by the (now stopped) PSOCK cluster.
registerDoSEQ()
unregister_dopar <- function() {
  # Remove every entry in foreach's internal backend registry so the previously
  # registered parallel backend is fully forgotten.
  backend_registry <- foreach:::.foreachGlobals
  rm(list = ls(name = backend_registry), pos = backend_registry)
}
unregister_dopar()
# ---- Bootstrap LASSO selection frequencies --------------------------------
# Resample half of the observations N times and count how often each
# coefficient is selected at lambda.min.
# Derive the coefficient-vector length from X instead of hard-coding 27/26:
# the original constants (left over from a wider design matrix) padded the
# frequency tables with NA-named rows that were always zero.
p = ncol(X) + 1   # intercept + predictors
varname = colnames(X)
mat = matrix(0,ncol = p)
N = 400
sca.x = as.matrix(X)
for (i in 1:N){
  # 50% subsample without replacement
  tp = sample(1:nrow(X),0.5*nrow(X))
  x.tp = sca.x[tp,]
  y.tp = Y[tp]
  # LASSO on the subsample; tally coefficients active at lambda.min
  cvtp = cv.glmnet(x.tp, y.tp, alpha = 1, parallel = T)
  tt <- as.matrix(coef(cvtp, s = "lambda.min"))
  mat[which(tt!=0)]=mat[which(tt!=0)]+1
}
## Warning: executing %dopar% sequentially: no parallel backend registered
df <- as.data.frame(t(mat))
df$id <- 1:p
df$names <- c("intercept",varname)
ddf <- df[order(df$V1,decreasing = T),]
ddf$fre = ddf$V1/N            # selection frequency per coefficient
ddf[2:6,c(3,4)] # row 1 is the intercept
## names fre  (values from the knitted run)
## 4 CHARGES 0.2750
## 3 DRG 0.2725
## 5 LOS 0.1625
## 6 AGE 0.1000
## 2 SEX 0.0900
# ---- Bootstrap knockoff selection frequencies -----------------------------
N=1000
num = matrix(0,ncol = p)
for (i in 1:N){
  tp = sample(1:nrow(X),0.5*nrow(X))
  x.tp = sca.x[tp,]
  y.tp = Y[tp]
  tt <- knockoff.filter(X = x.tp, y = y.tp,fdr=0.2)
  # tt$selected holds indices of X's columns; shift by 1 so slot 1 remains the
  # intercept (the original tallied num[tt], off by one against the labels).
  tt <- as.matrix(tt$selected)
  num[tt + 1]=num[tt + 1]+1
}
df2 <- as.data.frame(t(num))
df2$id <- 1:p
df2$names <- c("intercept",varname)
ddf2 <- df2[order(df2$V1,decreasing = T),]
ddf2$fre = ddf2$V1/N
ddf2[2:6,c(3,4)]
## names fre  (values from the knitted run)
## 2 SEX 0.004
## 3 DRG 0.004
## 4 CHARGES 0.004
## 5 LOS 0.004
## 6 AGE 0.000
## All variables have near-zero selection frequency under the knockoff filter,
## i.e. effectively no variable is selected. DRG and CHARGES have the highest
## bootstrap-LASSO frequencies, consistent with the variables selected by LASSO.